#Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import plotly.express as px
from itertools import cycle
from plotly.validators.scatter.marker import SymbolValidator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report,f1_score,precision_score,recall_score
from sklearn.metrics import ConfusionMatrixDisplay,roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
#Reading csv file of Financial Transactions
credit_card = pd.read_csv('/content/FinancialTransactions.csv')
#Removing Unnamed:0 column as it is not useful for predicting fraud transactions. i.e it ia a ID column
credit_card.drop(columns={'Unnamed: 0'},inplace=True)
#Printing first 2 rows of the dataset
credit_card.head(2)
| Amount | Date | Card Type | MCC Category | Location | Device | Previous Transactions | Balance Before Transaction | Time of Day | Velocity | Customer Age | Customer Income | Card Limit | Credit Score | Merchant Reputation | Merchant Location History | Spending Patterns | Online Transactions Frequency | Is Fraudulent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 180.924993 | 2023-07-02 | Debit | Electronics | UK | Desktop | 6 | 919.055267 | 10 | -0.337955 | 52 | 105545.340543 | 2503.758986 | 401 | Average | 6 | 828.820298 | Medium | 0 |
| 1 | 794.625797 | 2023-02-18 | Prepaid | Groceries | Canada | Desktop | 5 | 3529.930762 | 17 | 0.015117 | 62 | 92651.854405 | 12885.681726 | 409 | Average | 13 | 4384.528307 | Low | 1 |
#Printing the columns of the dataset
credit_card.columns
Index(['Amount', 'Date', 'Card Type', 'MCC Category', 'Location', 'Device',
'Previous Transactions', 'Balance Before Transaction', 'Time of Day',
'Velocity', 'Customer Age', 'Customer Income', 'Card Limit',
'Credit Score', 'Merchant Reputation', 'Merchant Location History',
'Spending Patterns', 'Online Transactions Frequency', 'Is Fraudulent'],
dtype='object')
#Printing shape of the dataset - 1000 rows and 19 columns
credit_card.shape
(1000, 19)
#Showing the summary of the input dataset - Column name, Number of non-missing values count,DataType of each column
credit_card.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Amount 1000 non-null float64 1 Date 1000 non-null object 2 Card Type 1000 non-null object 3 MCC Category 1000 non-null object 4 Location 1000 non-null object 5 Device 1000 non-null object 6 Previous Transactions 1000 non-null int64 7 Balance Before Transaction 1000 non-null float64 8 Time of Day 1000 non-null int64 9 Velocity 1000 non-null float64 10 Customer Age 1000 non-null int64 11 Customer Income 1000 non-null float64 12 Card Limit 1000 non-null float64 13 Credit Score 1000 non-null int64 14 Merchant Reputation 1000 non-null object 15 Merchant Location History 1000 non-null int64 16 Spending Patterns 1000 non-null float64 17 Online Transactions Frequency 1000 non-null object 18 Is Fraudulent 1000 non-null int64 dtypes: float64(6), int64(6), object(7) memory usage: 148.6+ KB
Inference: The dataset contains 1000 records of financial transactions with each transaction containing a indicator whether the transaction is fraudulent or not. The dataset contains 18 columns which describes the characteristic of each transaction
# To analyse how transactions happen over time aggregating transactions by month will be more effective than date
credit_card['Date']=pd.to_datetime(credit_card['Date'])
credit_card['Month']=credit_card['Date'].dt.month
Distribution of DV - Is Fraudulent
plt.figure(figsize=(6,4))
sns.countplot(x='Is Fraudulent', data=credit_card)
plt.title('Fraud vs. Non-Fraud Transactions')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Number of Transactions')
plt.show()
There is a significant imbalance between the two classes. The majority of transactions are non-fraudulent (class 0), with a count close to 900. The number of fraudulent transactions (class 1) is much smaller, with a count that appears to be under 100
plt.figure(figsize=(6, 3))
sns.histplot(credit_card['Amount'], kde=True)
plt.title('Distribution of Transaction Amounts')
plt.show()
Amount involved in the transaction is visualized using a histogram to check the presence of skewness. Amount ranges from 0 to 1000 with more number of transactions happened involving amount of 600
# Scatter plot to identify outliers
plt.figure(figsize=(6, 3))
sns.scatterplot(x='Amount', y='Velocity', data=credit_card)
plt.title('Amount vs. Velocity')
plt.show()
Insights:
The distribution is somewhat uniform across the bins, but with some fluctuations. The highest count of transactions occurs around the 600-700 range
The scatter plot shows a fairly random distribution of data points, with no apparent pattern or trend between the two variables.
This randomness implies that the velocity does not vary systematically with the transaction amount in this dataset. Overall, the plot suggests that transaction amount and velocity are independent of each other, with no clear linear or nonlinear relationship visible.
# Assume df is your DataFrame and 'Is Fraudulent' is the output variable
features = ['Amount', 'Balance Before Transaction', 'Velocity', 'Customer Age', 'Customer Income']
for feature in features:
plt.figure(figsize=(6, 3))
sns.histplot(data=credit_card[credit_card['Is Fraudulent'] == 0], x=feature, color='blue', label='Non-Fraudulent', kde=True)
sns.histplot(data=credit_card[credit_card['Is Fraudulent'] == 1], x=feature, color='red', label='Fraudulent', kde=True)
plt.title(f'Histogram of {feature} by Fraudulent Status')
plt.xlabel(feature)
plt.ylabel('Frequency')
plt.legend()
plt.show()
#An analysis on Different Card Types held by customers
proc_df = credit_card.groupby(["Card Type"]).size().to_frame() # - aggregating data with respect to Card Type
proc_df.rename(columns={0:'Frequency'},inplace=True)
proc_df=proc_df.reset_index(level='Card Type')
fig=plt.figure(figsize=(10,7)) #Setting the axis for creating chart
#- Barplot function of seaborn library with x axis as Different Card types and its frequency as y axis
ax=sns.barplot(x=proc_df['Card Type'],y=proc_df['Frequency'])
for p in ax.patches:
ax.annotate(format(p.get_height(), '.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
plt.title("Distribution of Card Type held by the customers")
plt.xlabel("Card Type",fontsize=12)
plt.ylabel("Count",fontsize=12)
Text(0, 0.5, 'Count')
# Creating a histogram on age of customers- as it helps in visualizing the distribution of the age
# Passing age column to Histogram function of plotly library as this function creates more interactive plots than matplotlib and seaborn
fig = px.histogram(credit_card, x="Customer Age", nbins=20,title='Histogram of Age of Different Customers')
fig.show()
Histogram of age of customers shows that more number of customers present in the dataset in more than 75 bin with 97 customers and the less number of customers are present in the bin of 70-74 with 62 customers. The age is uniformly distributed with no outliers
#Stacked histogram - Distribution of Age with respect to each card type
fig = px.histogram(credit_card, x="Customer Age", color="Card Type")
fig.show()
The above stacked histogram shows that distribution of age is uniform irrespective of card type
#Histogram on velocity with respect to each card type and is Fraudulent
#passing x axis as Card Type and y axis as velocity with respect to Frudulent indicator. To show the avg velocity histfunc is passes ad avg
fig = px.histogram(credit_card, x="Card Type", y='Velocity', color="Card Type", pattern_shape="Is Fraudulent",histfunc="avg")
fig.show()
The above histogram shows that in debit and prepaid cards range and avg velocity is higher for fradulent transactions on an average of 0.05 than non fraudulent transactions which has an average of 0.009. In credit card the avg velocity is around 0.03 for fraudulent transactions whereas 0.003 for non fraudulent transactions
##Bar chart - Analysis the type of card transaction used in each MCC category
#Aggregating data by MCC category and card type
#Creating x axis as MCC Category, y axis as frequency of transactions in each category with respect to each card type
proc_df = credit_card.groupby(["MCC Category","Card Type"]).size().to_frame()
proc_df.rename(columns={0:'Frequency'},inplace=True)
proc_df=proc_df.reset_index(level=['MCC Category','Card Type'])
fig = px.histogram(proc_df, x="MCC Category",y="Frequency",
color='Card Type', barmode='group',labels={'sum of Frequency':'Count','Kind':'Card type'},
title="Type of card transaction used in each MCC category",
height=400)
fig.show()
Analysis of frequency of transactions on each category with respect to each card type shows that most of the transactions are done with credit card in travel category with total of 62 transactions and less number of transaction was made in Groceries category with prepaid card with total of 35 transactions
Scatter plots are used to identify if there exists any pattern between the variables
#Scatter plot analysis on Balance Before Transaction and Amount involved in the transaction by Is Fraudulent
#This analysis is done to see if there is any pattern exists between these variables with respect to fraudulent indicator
plt.figure(figsize=(16,6))#-Scatter function of matplotlib library to create the scatter plot
scatter=plt.scatter(credit_card['Balance Before Transaction'],credit_card['Amount'],s=50,
c=credit_card['Is Fraudulent'].astype('category').cat.codes,
marker='*',linewidths=2)
plt.title('Balance before transaction vs amount involved in current transaction')
plt.xlabel('Balance Before Transaction')
plt.ylabel('Amount')
# Yes=mpatches.Patch(color='yellow',label='Yes')
# No=mpatches.Patch(color='purple',label='No')
# plt.legend(handles=[Yes,No],title="Is Fraud",ncol=2)
Text(0, 0.5, 'Amount')
The scatter plot shows that Amount involved in the transaction and Balance involved in the transaction are not associated. There is no pattern formed irrespective of the fraudulent indicator
#Scatter plot between Card Limit and Credit Score with respect to each card type
#the trendline parameter is set to os to check if any pattern is formed between these candidate IVS
fig = px.scatter(credit_card, x="Card Limit", y="Credit Score", color="Card Type",symbol="Card Type",
title='Card Limit vs Credit Score in each card type',trendline="ols", trendline_scope="overall",
hover_data=["Is Fraudulent"])
fig.update_traces(marker_size=10)
fig.show()
The scatter plot shows there is no pattern present and the card limit and credit score does not have any correlation between them
#Scatter plot analysis on Customer Income versus Spending Patterns with respect to each transaction frequency type
#To find if there is any relationship exists between spending patterns and customer income
fig = px.scatter(credit_card, x="Customer Income", y="Spending Patterns",facet_col="Online Transactions Frequency",
title='Customer income vs Spending Pattern in each category of online transaction frequency type',trendline='ols')
fig.update_traces(marker_size=10)
fig.show()
Irrespective of online transaction frequency type, data points do not form a pattern between customer income and spending patterns.
#To find if any relation exists between amount involved in the transaction and velocity with respect to each reputation type
# and Is Fraudulent
fig = px.scatter(credit_card, x="Amount", y="Velocity",facet_col="Merchant Reputation",facet_row="Is Fraudulent",
title='Amount involved in the transaction vs velocity in each merchant type',trendline='ols')
fig.update_traces(marker_size=10)
fig.show()
There is no pattern formed by the data points between amount involved in the transaction and velocity
##An analysis on amount involved in the transaction in each category of MCC over each month
# Aggregating the data by Month and MCC category. As not all categories will be present every month null values are replaced with zero
transact_=credit_card.groupby(["Month","MCC Category"])['Amount'].sum().unstack().fillna(0)
#Creating line plot using line function of plotly library with x axis as Month, y axis as amount and lines as MCC Category
fig = px.line(transact_,title="Amount spent in each category in each month")
fig.update_traces(mode="markers+lines")
fig.update_layout(
font_family="Times New Roman",
font_color="royalblue",
title_font_color="darkblue",
legend_font_color="darkblue")
fig.show()
The transaction analysis shows that travel related transactions are more in the Month of Feb to June and after November. Healthcare transactions had a great dip at the month of June
## Analysis on distribution of device used in the transaction in each city
#Calculating the no of transaction with respect to each city
frequency_df=credit_card['Location'].value_counts().to_frame()
frequency_df.rename(columns={'Location':'Transaction_Frequency'},inplace=True)
#Getting unique devices and cities present in the dataset
city_list=list(frequency_df.index)
devicetypes=list(credit_card['Device'].unique())
#Storing the no of transactions used by each device in each city in a dictionary
sorted_city={}
for i in range(len(city_list)):
city=city_list[i]
sorted_city[city]=[]
city_df=credit_card[credit_card['Location']==city]
for j in range(len(devicetypes)):
sorted_city[city].append(city_df[city_df['Device']==devicetypes[j]].shape[0])
finalcity_df=pd.DataFrame.from_dict(sorted_city,orient='index',columns=devicetypes)
#Creating a stacked bar chart from the dictionary of different types of device used in each transaction in each city
ax=finalcity_df.plot(stacked=True, kind='bar',figsize=(15,7),color=("blue","royalblue","cyan"))
for bar in ax.patches:
height = bar.get_height()
width = bar.get_width()
x = bar.get_x()
y = bar.get_y()
label_text = height
label_x = x + width / 2
label_y = y + height / 2
if label_text!=0:
ax.text(label_x, label_y, label_text, ha='center',
va='center')
plt.title("Frequency Distribution of Device used in Transaction in all cities")
plt.xticks(rotation=0,ha="center")
plt.locator_params(axis='y',nbins=20)
# Display chart - x axis as City, y axis as Number of transactions and each patch showing each type of device
plt.show()
The bar chart displays that more number of desktop transactions has happened in US and less number of transactions has happened through POS in Germany
##Analysis on trnsactions involved in each transaction type in each MCC category in each hour of day
#Data aggregation with respect to Time and MCC Category
merchant_df=credit_card.groupby(["Time of Day","MCC Category"]).size().to_frame()
merchant_df=merchant_df.reset_index(level=['Time of Day','MCC Category'])
merchant_df.rename(columns={0:'Frequency'},inplace=True)
merchant_df=merchant_df.sort_values('Frequency',ascending=True)
#Creating an area plot to show the volume of transactions involved in each category at each hour of day
#with x axis as Time and y axis as Volume of Transactions and hue as MCC category
fig = px.area(merchant_df,x='Time of Day',y='Frequency',color='MCC Category',
title="Frequency of transaction in each category at each point of time",
labels={"value":"Frequency","MCC Category":"Type of MCC Category"})
fig.update_traces(mode="markers+lines")
fig.update_layout(
font_family="Times New Roman",
font_color="royalblue",
title_font_color="darkblue",
legend_font_color="darkblue")
fig.show()
In the earlier part of the day more number of transactions happened in travel category whereas the volume of transaction involved in Entertainment is higher than other categories at the later part of the day.
# Checking if there is any inter correlation exists between the Quantitative variables of the dataset as inter correlation may affect the model performance
credit_card1=credit_card[['Amount','Balance Before Transaction','Velocity','Previous Transactions','Customer Age','Customer Income','Card Limit','Credit Score','Merchant Location History','Spending Patterns','Time of Day','Is Fraudulent']]
sns.heatmap(data = credit_card1.corr())
<Axes: >
Models like logistic regression assumes that there is no multicollinearity present between the independent variables. So in such cases it is essential to remove variables that has inter correlation between other candidate IVs. To view the correlation heatmap function of seaborn library is used and the heatmap shows that there is no multicollinearity present between the variables. All variables have a very low inter correlation value i.e below 0.3
##-Outlier Detection using Box Plot Analysis
cols=['Amount','Balance Before Transaction','Velocity','Previous Transactions','Customer Age','Customer Income','Card Limit','Credit Score','Merchant Location History','Spending Patterns','Time of Day']
for col in cols:
plt.figure(figsize=(17,1))
sns.boxplot(data=credit_card, x=col)
the presence of outliers may affect the model results drastically. So the quantitative variables are tested for the presence of outliers using BOX PLot. All the variables does not have any data point outside the IQR range. So there is no outliers present.
## Data is tested for the presence of skewness
for col in cols:
plt.figure(figsize=(6, 3))
stats.probplot(credit_card[col], dist="norm", plot=plt)
plt.title('Q-Q Plot - {}'.format(col))
plt.show()
skewness = stats.skew(credit_card[col]) # Drop NaN values if any
print(f'Skewness: {skewness}')
Skewness: -0.11748818504525273
Skewness: 0.07906457314869914
Skewness: -0.006670828175795649
Skewness: 0.026321773857642183
Skewness: 0.01698181416579776
Skewness: 0.0738174967421649
Skewness: -0.05058948966944294
Skewness: -0.04071438562859312
Skewness: 0.042327976592537585
Skewness: -0.031155329001456126
Skewness: -0.00821664809001059
The presence of skewness in the input variables affects the model. So the presence of skewness is detected using QQ Plot and skew function of stats library. Both the plot and values shows that the variables does have very mild skewness.
It is essential that proper and useful features are fed to the model. So 3 feature engineering steps are done to select the most useful features
##Label Encoding - Ordinal variables - preserving the order
custom_mapping1 = {'Bad': 0, 'Average': 1, 'Good': 2}
custom_mapping2 = {'Low': 0, 'Medium': 1, 'High': 2}
credit_card['merchant_reputation_encoded']=credit_card['Merchant Reputation'].map(custom_mapping1)
credit_card['OT_Frequency_encoded']=credit_card['Online Transactions Frequency'].map(custom_mapping2)
#Removng the original categorical vars
credit_card.drop(columns=['Online Transactions Frequency','Merchant Reputation'],inplace=True)
## One hot Encoding of Nominal Variables - creating dummy variables for each possible value
categorical_df=pd.get_dummies(credit_card,columns=['Card Type','Location','Device','MCC Category'])
#Converting boolean vars to binary variables
card_cols=[col for col in categorical_df.columns if col.startswith('Card Type')]
device_cols=[col for col in categorical_df.columns if col.startswith('Device')]
mcc_cols=[col for col in categorical_df.columns if col.startswith('MCC Category')]
location_cols=[col for col in categorical_df.columns if col.startswith('Location')]
categorical_df[card_cols]=categorical_df[card_cols].astype(int)
categorical_df[device_cols]=categorical_df[device_cols].astype(int)
categorical_df[mcc_cols]=categorical_df[mcc_cols].astype(int)
categorical_df[location_cols]=categorical_df[location_cols].astype(int)
Some variables when interacting with other variables may have more predictive power than the raw variables. So feature interactions on possible variables are done to create new variables.
## Deriving Time related variables as aggregating data at higher level may give more useful
categorical_df['Date']=pd.to_datetime(categorical_df['Date'])
categorical_df['Month']=categorical_df['Date'].dt.month
categorical_df['Day']=categorical_df['Date'].dt.day
categorical_df['Quarter']=categorical_df['Date'].dt.quarter
## Deriving domain related variables.
categorical_df['risk_score'] = (1 / categorical_df['Credit Score']) * (categorical_df['Amount'] / categorical_df['Card Limit'])
categorical_df['balance_income_ratio'] = categorical_df['Balance Before Transaction'] / categorical_df['Customer Income']
## Deriving the customer behaviour based variables
categorical_df['spending_pattern_score'] = categorical_df['Spending Patterns'] * categorical_df['Previous Transactions'] * categorical_df['Merchant Location History']
categorical_df['time_based_interaction'] = categorical_df['Time of Day'] * categorical_df['Day'] * categorical_df['Month'] * categorical_df['Quarter']
## Deriving Risk related variables for each transaction
categorical_df['risk_adjusted_limit'] = categorical_df['Card Limit'] / categorical_df['risk_score']
categorical_df['credit_merchant_risk'] = categorical_df['Credit Score'] * categorical_df['merchant_reputation_encoded']
categorical_df['geo_spending_profile'] = categorical_df[['Location_Germany', 'Location_France', 'Location_UK',
'Location_Canada', 'Location_US']].sum(axis=1) * categorical_df['Spending Patterns'] * categorical_df[['Device_Desktop', 'Device_POS', 'Device_Mobile']].sum(axis=1)
Not all variables in the model will be useful in classifying the fraudulent transaction. So subset of features are selected using their correlation with the dependent variable
# Calculate correlation with the target and sort them in the descending order
correlation = categorical_df.corrwith(credit_card['Is Fraudulent']).abs()
feature_importance = correlation.sort_values(ascending=False)
print(feature_importance)
Is Fraudulent 1.000000 Location_Germany 0.069065 Location_France 0.064545 MCC Category_Healthcare 0.061468 MCC Category_Clothing 0.052695 Location_UK 0.035462 Previous Transactions 0.034964 Merchant Location History 0.029830 Credit Score 0.029373 balance_income_ratio 0.028630 Location_Canada 0.026141 Balance Before Transaction 0.025076 geo_spending_profile 0.024766 Spending Patterns 0.024766 credit_merchant_risk 0.024541 merchant_reputation_encoded 0.024122 Card Limit 0.022873 OT_Frequency_encoded 0.022657 MCC Category_Travel 0.021569 Card Type_Credit 0.021398 Customer Income 0.019926 time_based_interaction 0.019484 Device_Desktop 0.018145 Day 0.017909 Card Type_Prepaid 0.017770 Customer Age 0.016205 Time of Day 0.015371 spending_pattern_score 0.014883 risk_adjusted_limit 0.014879 Date 0.014256 Month 0.013153 MCC Category_Groceries 0.012501 Velocity 0.012391 Location_US 0.012077 Device_POS 0.011674 MCC Category_Electronics 0.010313 risk_score 0.008063 MCC Category_Restaurants 0.006726 Device_Mobile 0.006644 MCC Category_Entertainment 0.004735 Card Type_Debit 0.004152 Quarter 0.002141 Amount 0.001613 dtype: float64
Location_Germany which is the encoded variable from Location has high correlation with the DV is Fraudulent. and Amount involved in the transaction has less correlation with the DV Is Fraudulent
column_list=list(feature_importance.index)
column_list.remove('Date')
column_list.remove('Amount')
column_list.remove('Quarter')
column_list.remove('Is Fraudulent')
column_list
['Location_Germany', 'Location_France', 'MCC Category_Healthcare', 'MCC Category_Clothing', 'Location_UK', 'Previous Transactions', 'Merchant Location History', 'Credit Score', 'balance_income_ratio', 'Location_Canada', 'Balance Before Transaction', 'geo_spending_profile', 'Spending Patterns', 'credit_merchant_risk', 'merchant_reputation_encoded', 'Card Limit', 'OT_Frequency_encoded', 'MCC Category_Travel', 'Card Type_Credit', 'Customer Income', 'time_based_interaction', 'Device_Desktop', 'Day', 'Card Type_Prepaid', 'Customer Age', 'Time of Day', 'spending_pattern_score', 'risk_adjusted_limit', 'Month', 'MCC Category_Groceries', 'Velocity', 'Location_US', 'Device_POS', 'MCC Category_Electronics', 'risk_score', 'MCC Category_Restaurants', 'Device_Mobile', 'MCC Category_Entertainment', 'Card Type_Debit']
Date variable has no predictive power with the Dv as we have created other aggregated variables to capture the timing.Is Fraudulent is the target variable and the Day and Amount are the variables with very less correlation ie below 0.02%. So except these 4 variables, the remaining subset of features are selected
As we have many new variables created from feature crossing, it is essential that we remove the variables that has multicollinearity
# Calculate the correlation matrix
correlation_matrix = categorical_df[column_list].corr()
# Select only the upper triangle of the correlation matrix (excluding the diagonal)
upper_triangle = correlation_matrix.where(
np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)
# Find the features that have a very strong inter correlation i.e greater than 0.6(above 70% is considered as strong correlation)
high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.6)]
print(high_correlation_features)
['Balance Before Transaction', 'Spending Patterns', 'merchant_reputation_encoded']
sns.heatmap(data = categorical_df[column_list].corr())
<Axes: >
column_list2=column_list.copy()
column_list2.remove('Balance Before Transaction')
column_list2.remove('Spending Patterns')
column_list2.remove('merchant_reputation_encoded')
As Balance Before Transaction, Month, Spending Patterns and Merchant Reputation have high inter correlation so these variables are removed from the candidate IV list
X=categorical_df[column_list]
Y=categorical_df['Is Fraudulent']
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier( random_state=45)
model.fit(X, Y)
# Get feature importance
importances = model.feature_importances_
importances_percent = 100 * importances
# Create a DataFrame to display feature importances
feature_importances = pd.DataFrame({
'Feature': X.columns,
'Importance': importances,
'Importance (%)': importances_percent
}).sort_values(by='Importance', ascending=False)
# Print the feature importances
print(feature_importances)
Feature Importance Importance (%) 15 Card Limit 0.067836 6.783627 7 Credit Score 0.058651 5.865065 10 Balance Before Transaction 0.056729 5.672880 8 balance_income_ratio 0.054111 5.411148 19 Customer Income 0.050586 5.058624 26 spending_pattern_score 0.050239 5.023857 34 risk_score 0.050031 5.003140 27 risk_adjusted_limit 0.048271 4.827127 11 geo_spending_profile 0.048260 4.825961 30 Velocity 0.047984 4.798414 20 time_based_interaction 0.045693 4.569340 12 Spending Patterns 0.045644 4.564355 13 credit_merchant_risk 0.040968 4.096791 24 Customer Age 0.036812 3.681176 22 Day 0.035707 3.570676 28 Month 0.034657 3.465720 25 Time of Day 0.033626 3.362622 6 Merchant Location History 0.030914 3.091442 5 Previous Transactions 0.026897 2.689691 2 MCC Category_Healthcare 0.011948 1.194797 16 OT_Frequency_encoded 0.011749 1.174907 14 merchant_reputation_encoded 0.010915 1.091520 1 Location_France 0.009363 0.936286 21 Device_Desktop 0.009066 0.906641 18 Card Type_Credit 0.007657 0.765746 33 MCC Category_Electronics 0.006980 0.697951 38 Card Type_Debit 0.006669 0.666949 9 Location_Canada 0.006664 0.666361 29 MCC Category_Groceries 0.006568 0.656819 37 MCC Category_Entertainment 0.006450 0.644988 31 Location_US 0.005892 0.589190 32 Device_POS 0.005851 0.585146 23 Card Type_Prepaid 0.005776 0.577638 35 MCC Category_Restaurants 0.005445 0.544544 17 MCC Category_Travel 0.005074 0.507377 36 Device_Mobile 0.004344 0.434388 0 Location_Germany 0.003935 0.393494 4 Location_UK 0.003292 0.329171 3 MCC Category_Clothing 0.002744 0.274430
Card Limit and Credit Score are the top most important features holding more than 10% of total feature importance and Location_UK, MCC_Category_Clothing are the least important features holding less than 0.5% feature importance
#Column list after removing the inter correlated variables are used
X=categorical_df[column_list2]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 80% for training and 20% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42,stratify=Y)
Data Standardization
## Normalizing the data using z score normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
For the algorithms to converge faster feature scaling is done by z score normalization. Data Standardization is done after train and test split to avoid data leakage during training.
#Defining Logistic regression function with default parameters from sklearn module
model=LogisticRegression(random_state=45)
#Fit the model using standardized training data
model.fit(X_train_scaled, y_train)
#Predictions are made using built model on train and test data
y_pred_train=model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)
##As it is a classification model - Accuracy, Precision,Recall and F1 score is calculated
## Classification report function of sklearn module is used to print the evaluation metrics
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 0.00 0.00 0.00 42
accuracy 0.95 800
macro avg 0.47 0.50 0.49 800
weighted avg 0.90 0.95 0.92 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
## As it is a class imbalance problem, the most appropriate metric to tune is f1 score which balances both precision and recall.
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.0 0.0
Classification report for train and test data shows that the model classifies all transactions as non fraudulent transactions and the model is not able to classify fraudulent transactions as the train and test f1 score is 0.
Hyperparameters Tuned for Logistic Regression -
## Creating the parameter space
##Creating 3 different parameter space as some of the solvers are not consistent with some of the regularization techniques
param_grid_liblinear_saga = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l1','l2'],
'logisticregression__solver': ['liblinear', 'saga'], # 'l1' and 'l2' penalty supports 'liblinear' and 'saga'
# 'logisticregression__max_iter': [100, 200, 300],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_lbfgs_newton_cg = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l2',None],
'logisticregression__solver': ['lbfgs', 'newton-cg','newton-cholesky','sag'], # 'l2' penalty supports 'lbfgs' and 'newton-cg'
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_elasticnet = {
'logisticregression__C': [0.001, 0.01, 0.1, 1,10,100],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5,0.05,0.9,0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga'], # elasticnet supports only saga solver
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
# Create a pipeline with standardization and logistic regression
# For Logistic regression max iteration is set to 1000 to balance the accuracy and convergence time and random state as 45 to get consisten results
pipeline = Pipeline([
('scaler', StandardScaler()),
('logisticregression', LogisticRegression(max_iter=1000,random_state=45))
])
#Using Stratified K fold cross validation as the dataset has a huge imbalance in target variable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV to explore the hyperparameter combinations
# 3 Grid Search is initialized for each of the parameter space
grid_search_liblinear_saga = GridSearchCV(estimator=pipeline,param_grid=param_grid_liblinear_saga,cv=skf,n_jobs=-1,scoring='f1')
grid_search_lbfgs_newton_cg = GridSearchCV(estimator=pipeline,param_grid=param_grid_lbfgs_newton_cg,cv=skf,n_jobs=-1,scoring='f1')
grid_search_elasticnet = GridSearchCV(estimator=pipeline,param_grid=param_grid_elasticnet,cv=skf,n_jobs=-1,scoring='f1')
# Fit the model using GridSearchCV to find the best hyperparameter combination
grid_search_liblinear_saga.fit(X_train, y_train)
grid_search_lbfgs_newton_cg.fit(X_train, y_train)
grid_search_elasticnet.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000, random_state=45))])StandardScaler()
LogisticRegression(max_iter=1000, random_state=45)
#The best parameter combination obtained from grid search 1 is evaluated using f1 score
best_model = grid_search_liblinear_saga.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0997624703087886 Test F1 Score: 0.1042654028436019
#Best parameter combination from Grid search 1
grid_search_liblinear_saga.best_params_
{'logisticregression__C': 0.01,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l1',
'logisticregression__solver': 'saga'}
#The best parameter combination obtained from grid search 2 is evaluated using f1 score
best_model = grid_search_lbfgs_newton_cg.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0759493670886076 Test F1 Score: 0.043478260869565216
#Best parameter combination from Grid Seach 2
grid_search_lbfgs_newton_cg.best_params_
{'logisticregression__C': 10,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l2',
'logisticregression__solver': 'sag'}
#The best parameter combination obtained from grid search 3 is evaluated using f1 score
best_model = grid_search_elasticnet.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0997624703087886 Test F1 Score: 0.1042654028436019
#Best parameter combination from Grid Search 3
grid_search_elasticnet.best_params_
{'logisticregression__C': 0.001,
'logisticregression__class_weight': 'balanced',
'logisticregression__l1_ratio': 0.1,
'logisticregression__penalty': 'elasticnet',
'logisticregression__solver': 'saga'}
As we have intialized 3 different parameter space for each of the solvers the final best parameter combination is chosen among the 3 grid search using f1 score as the dataset has a huge imbalance. The first and third parameter combination has less bias and less variance than 2nd hyperparameter combination
##Choose one with the best results from the above three combinations to print the classification report
#For example here grid_search_elasticnet performed better than others
best_model = grid_search_liblinear_saga.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred_test = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
##Calculating the Classification Report of the best parameter combination
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.00 0.00 0.00 758
1 0.05 1.00 0.10 42
accuracy 0.05 800
macro avg 0.03 0.50 0.05 800
weighted avg 0.00 0.05 0.01 800
Classification Report of Test Data
precision recall f1-score support
0 0.00 0.00 0.00 189
1 0.06 1.00 0.10 11
accuracy 0.06 200
macro avg 0.03 0.50 0.05 200
weighted avg 0.00 0.06 0.01 200
Evaluation results shows that the model has predicted everything as fraudulent class as with respect to fraudulent class the recall value is 1 but with respect to non-fraudulent class the recall is 0 for both training and test data. The model has a high bias and less variance between the training and test data as the f1 score for both train and test data is around 0.10.
#Printing the confusion matrix for both training and test data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Train')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Test')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.06 Recall (TPR): 1.00 F1 Score: 0.10 Accuracy: 0.06 True Positive Rate (TPR): 1.00 False Positive Rate (FPR): 1.00 False Negative Rate (FNR): 0.00 True Negative Rate (TNR): 0.00
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.5
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
##Initializing K Nearest Neighbors Classifier of scikit learn
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier()
##Fit the model with the scaled data as K Nearest Neighbors is a distance based classifier
knn_clf.fit(X_train_scaled, y_train)
y_pred_train=knn_clf.predict(X_train_scaled)
y_pred_test = knn_clf.predict(X_test_scaled)
#Show the classification report of training and test data
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 1.00 0.02 0.05 42
accuracy 0.95 800
macro avg 0.97 0.51 0.51 800
weighted avg 0.95 0.95 0.92 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.99 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.48 200
weighted avg 0.89 0.94 0.92 200
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.046511627906976744 0.0
Evaluation metrics on train and test data shows that model is able to classify non fraudulent transactions effectively in train data and test data but unable to classify the fraudulent transactions
Algorithm - Underlying method used to compute the nearest neighbors
The metric is chosen as minkowski whereas p is adjusted as 1,2 and so on to specify the norm
#Creating the parameter space for K Nearest Neighbors
param_grid= {
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1,2,3,4],
'knn__weights': ['uniform', 'distance'],
'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
# Create a pipeline with standardization and K Nearest neighbor Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('knn', KNeighborsClassifier(metric='minkowski'))
])
#Intializing Stratified K fold cross validation with folds=5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV
grid_search_knn = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_knn.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])StandardScaler()
KNeighborsClassifier()
#best parameter combination from the grid search
grid_search_knn.best_params_
{'knn__algorithm': 'auto',
'knn__n_neighbors': 3,
'knn__p': 1,
'knn__weights': 'uniform'}
#F1 score for both train and test data
best_model = grid_search_knn.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.13333333333333333 Test F1 Score: 0.0
#Show the classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 1.00 0.07 0.13 42
accuracy 0.95 800
macro avg 0.98 0.54 0.55 800
weighted avg 0.95 0.95 0.93 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.99 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.49 0.48 200
weighted avg 0.89 0.94 0.91 200
The above best hyperparameter combination of KNN model is able to classify non fraudulent transactions in both train and test data but is unable to classify fraudulent transactions in test data and only 7% in train data. Thus the model has high bias
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.94 Misclassification Rate: 0.06 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.01 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 0.99
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.5052910052910053
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
Guassian Naive Bayes classifier is used as the dataset has a mix of categorical and continous variables.
#GaussianNB is initialized from sklearn library
from sklearn.naive_bayes import GaussianNB
nb_clf=GaussianNB()
#Train the model using Gaussian NB Classifier
#Scaling is not required as it is a probabilistic generative classifier
nb_clf.fit(X_train, y_train)
y_pred_train=nb_clf.predict(X_train)
y_pred_test = nb_clf.predict(X_test)
#Print the Classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 0.00 0.00 0.00 42
accuracy 0.95 800
macro avg 0.47 0.50 0.49 800
weighted avg 0.90 0.95 0.92 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
0.0
0.0
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0 in both train data and test data as in both cases the model classifies all transactions as non fraudulent transactions
Variable Smoothing - Adding numerical stability
Another hyperparameter technique is to convert the categorical variables to continous features and that is done in the feature engineering section
#Creating the parameter space
param_grid= {
'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
# Create a pipeline with Naive Bayes Classifier
#Naive Bayes works on probability so standardization is not required
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('nb', GaussianNB())
])
#Initialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1
grid_search_nb = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_nb.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')Pipeline(steps=[('nb', GaussianNB())])GaussianNB()
#Best parameter combination
grid_search_nb.best_params_
{'nb__var_smoothing': 1e-09}
#Evaluating the bet parameter combination
best_model = grid_search_nb.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0 Test F1 Score: 0.0
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 0.00 0.00 0.00 42
accuracy 0.95 800
macro avg 0.47 0.50 0.49 800
weighted avg 0.90 0.95 0.92 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.94 Misclassification Rate: 0.06 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.00 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 1.00
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.37421837421837423
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
AdaBoost Classifier is tried with Logistic Regression as base estimator
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=LogisticRegression(), random_state=42)
ada_clf.fit(X_train_scaled, y_train)
y_pred_train=ada_clf.predict(X_train_scaled)
y_pred_test = ada_clf.predict(X_test_scaled)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 758
1 0.00 0.00 0.00 42
accuracy 0.95 800
macro avg 0.47 0.50 0.49 800
weighted avg 0.90 0.95 0.92 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0 in train data and test data as in both train and test data the model classifies all transactions as non fraudulent transactions
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__C': [1.0,0,0.01,0.1,0.001],
'ada__base_estimator__penalty': ['l2','l1','elasticnet'],
'ada__base_estimator__class_weight':[None,'balanced']
}
# Create a pipeline with standardization and AdaBoost Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),random_state=45))
])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV
grid_search_ada_v1 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v1.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))])StandardScaler()
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45)LogisticRegression(random_state=45)
LogisticRegression(random_state=45)
grid_search_ada_v1.best_params_
{'ada__base_estimator__C': 0.01,
'ada__base_estimator__class_weight': 'balanced',
'ada__base_estimator__penalty': 'l2',
'ada__learning_rate': 0.1,
'ada__n_estimators': 200}
best_model = grid_search_ada_v1.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.10759493670886076 Test F1 Score: 0.11494252873563218
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.96 0.27 0.42 758
1 0.06 0.81 0.11 42
accuracy 0.29 800
macro avg 0.51 0.54 0.26 800
weighted avg 0.91 0.29 0.40 800
Classification Report of Test Data
precision recall f1-score support
0 0.97 0.19 0.32 189
1 0.06 0.91 0.11 11
accuracy 0.23 200
macro avg 0.52 0.55 0.22 200
weighted avg 0.92 0.23 0.31 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.06 Recall (TPR): 0.91 F1 Score: 0.11 Accuracy: 0.23 Misclassification Rate: 0.77 True Positive Rate (TPR): 0.91 False Positive Rate (FPR): 0.81 False Negative Rate (FNR): 0.09 True Negative Rate (TNR): 0.19
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.5541125541125541
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Using the column list before removing features with inter correlation as tree based models are not affected by the intercorrelation
X=categorical_df[column_list]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 80% for training and 20% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42,stratify=Y)
##Initializing the Decision Tree Classifier of sklearn library
model=DecisionTreeClassifier(random_state=45)
model.fit(X_train, y_train)
y_pred_train=model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.95 0.95 189
1 0.00 0.00 0.00 11
accuracy 0.90 200
macro avg 0.47 0.48 0.47 200
weighted avg 0.89 0.90 0.90 200
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.0
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
#Creating the parameter space
param_grid= {
'classifier__criterion': ['gini', 'entropy'], # Criterion for measuring quality of splits
'classifier__max_depth': [5, 10, 20, 30, 40, 50], # Maximum depth of the tree
'classifier__min_samples_split': [2, 5, 10,15], # Minimum number of samples required to split an internal node
'classifier__min_samples_leaf': [1, 2, 4,5], # Minimum number of samples required to be at a leaf node
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]#Class weight can be passed as dictionary mentioning the weight of each class
}
# Create a pipeline with Decision Tree Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('classifier', DecisionTreeClassifier(random_state=45))
])
#Intialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV and the metric to be tuned=f1
grid_search_dt = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_dt.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')Pipeline(steps=[('classifier', DecisionTreeClassifier(random_state=45))])DecisionTreeClassifier(random_state=45)
from sklearn.metrics import f1_score
best_model = grid_search_dt.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.34854771784232363 Test F1 Score: 0.22950819672131148
grid_search_dt.best_params_
{'classifier__class_weight': {0: 1, 1: 1000},
'classifier__criterion': 'entropy',
'classifier__max_depth': 20,
'classifier__max_features': 'sqrt',
'classifier__min_samples_leaf': 1,
'classifier__min_samples_split': 2}
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.79 0.88 758
1 0.21 1.00 0.35 42
accuracy 0.80 800
macro avg 0.61 0.90 0.62 800
weighted avg 0.96 0.80 0.86 800
Classification Report of Test Data
precision recall f1-score support
0 0.97 0.77 0.86 189
1 0.14 0.64 0.23 11
accuracy 0.77 200
macro avg 0.56 0.70 0.55 200
weighted avg 0.93 0.77 0.83 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.14 Recall (TPR): 0.64 F1 Score: 0.23 Accuracy: 0.77 Misclassification Rate: 0.23 True Positive Rate (TPR): 0.64 False Positive Rate (FPR): 0.23 False Negative Rate (FNR): 0.36 True Negative Rate (TNR): 0.77
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.6960076960076961
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Initializing random Forest classifier of sklearn
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=45)
rf_clf.fit(X_train, y_train)
y_pred_train=rf_clf.predict(X_train)
y_pred_test = rf_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.0
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
#Setting the parameter space for hyperparameter tuning
#RandomizedSearchCV is used as the number of hyperparameter combinations is very high so the model takes more time when using GridSearch
#Trying a subset of hyperparameter combinations using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid= {
'rf__n_estimators': [50,100,200],
'rf__max_depth': [5,10, 15,20],
'rf__min_samples_split': [2,5,8,10,11],
'rf__min_samples_leaf':[1,2,3,4,5],
'rf__max_features': ['auto', 'sqrt', 'log2'],
'rf__bootstrap': [True, False],
'rf__class_weight':[None,'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with Random Forest Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('rf', RandomForestClassifier(random_state=45))
])
#Initialize Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize RandomizedSearchCV with metric to be tuned as f1
grid_search_rf = RandomizedSearchCV(estimator=pipeline,param_distributions=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('rf', RandomForestClassifier(random_state=45))])RandomForestClassifier(random_state=45)
#Best parameter combination
grid_search_rf.best_params_
{'rf__n_estimators': 50,
'rf__min_samples_split': 10,
'rf__min_samples_leaf': 2,
'rf__max_features': 'sqrt',
'rf__max_depth': 5,
'rf__class_weight': {0: 1, 1: 1000},
'rf__bootstrap': False}
#Evaluating the best parameter combination
best_model = grid_search_rf.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.1222707423580786 Test F1 Score: 0.09782608695652173
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.20 0.34 758
1 0.07 1.00 0.12 42
accuracy 0.25 800
macro avg 0.53 0.60 0.23 800
weighted avg 0.95 0.25 0.33 800
Classification Report of Test Data
precision recall f1-score support
0 0.93 0.13 0.23 189
1 0.05 0.82 0.10 11
accuracy 0.17 200
macro avg 0.49 0.48 0.16 200
weighted avg 0.88 0.17 0.22 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.05 Recall (TPR): 0.82 F1 Score: 0.10 Accuracy: 0.17 Misclassification Rate: 0.83 True Positive Rate (TPR): 0.82 False Positive Rate (FPR): 0.87 False Negative Rate (FNR): 0.18 True Negative Rate (TNR): 0.13
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve-Random Forest')
plt.legend()
plt.show()
AUC-ROC: 0.5836940836940837
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#AdaBoost Classifier with decision tree as base model for the boosting rounds
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=42)
#Training the model
ada_clf.fit(X_train, y_train)
y_pred_train=ada_clf.predict(X_train)
y_pred_test = ada_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.93 0.93 189
1 0.00 0.00 0.00 11
accuracy 0.88 200
macro avg 0.47 0.46 0.47 200
weighted avg 0.89 0.88 0.88 200
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but .93 and 0 and in test data as in test data the model classifies Most of the transactions as non fraudulent transactions
##Creating the parameter space for the ada boost algorithm
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__max_depth': [2,3,4,5],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with AdaBoost Classifier. Standardization is not required for tree based model
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),random_state=45))
])
#Initializing Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1 score
grid_search_ada_v2 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v2.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))])AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45)DecisionTreeClassifier(random_state=45)
DecisionTreeClassifier(random_state=45)
#Best parameter combinations
grid_search_ada_v2.best_params_
{'ada__base_estimator__class_weight': {0: 1, 1: 100},
'ada__base_estimator__criterion': 'gini',
'ada__base_estimator__max_depth': 2,
'ada__learning_rate': 1.0,
'ada__n_estimators': 50}
#Evaluating the best hyperparameter combination
best_model = grid_search_ada_v2.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 1.0 Test F1 Score: 0.19999999999999998
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.96 0.96 189
1 0.22 0.18 0.20 11
accuracy 0.92 200
macro avg 0.59 0.57 0.58 200
weighted avg 0.91 0.92 0.92 200
##Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.22 Recall (TPR): 0.18 F1 Score: 0.20 Accuracy: 0.92 Misclassification Rate: 0.08 True Positive Rate (TPR): 0.18 False Positive Rate (FPR): 0.04 False Negative Rate (FNR): 0.82 True Negative Rate (TNR): 0.96
##ROC CURVE
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
#Plotting the ROC Curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.6137566137566137
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
Overall Performance: -Comparing All algorithms
#The hyperparameter combination that acheived good results is used
tree_model=DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)
tree_model.fit(X_train,y_train)
DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)##Visualizing the decision tree
from sklearn.tree import plot_tree
plt.figure(figsize=(15,10)) # Set the figure size
plot_tree(tree_model, filled=True, feature_names=column_list2, class_names=categorical_df['Is Fraudulent'].astype(str))
plt.title("Decision Tree")
plt.show()
# 2. Plot Feature Importances
# Get feature importances
importances = tree_model.feature_importances_
# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
'Feature': column_list,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(8,6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette="viridis")
plt.title("Feature Importance in Decision Tree Classifier")
plt.show()
Velocity holds the highest feature importance followed by Customer Age and Day
#Column list after removing the inter correlated variables are used
X=categorical_df[column_list2]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 80% for training and 20% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42,stratify=Y)
Data Standardization
## Normalizing the data using z score normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
For the algorithms to converge faster feature scaling is done by z score normalization. Data Standardization is done after train and test split to avoid data leakage during training.
#Defining Logistic regression function with default parameters from sklearn module
model=LogisticRegression(random_state=45)
#Fit the model using standardized training data
model.fit(X_train_scaled, y_train)
#Predictions are made using built model on train and test data
y_pred_train=model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)
##As it is a classification model - Accuracy, Precision,Recall and F1 score is calculated
## Classification report function of sklearn module is used to print the evaluation metrics
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 663
1 0.00 0.00 0.00 37
accuracy 0.95 700
macro avg 0.47 0.50 0.49 700
weighted avg 0.90 0.95 0.92 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.95 300
macro avg 0.47 0.50 0.49 300
weighted avg 0.90 0.95 0.92 300
## As it is a class imbalance problem, the most appropriate metric to tune is f1 score which balances both precision and recall.
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.0 0.0
Classification report for train and test data shows that the model classifies all transactions as non fraudulent transactions and the model is not able to classify fraudulent transactions as the train and test f1 score is 0.
Hyperparameters Tuned for Logistic Regression -
## Creating the parameter space
##Creating 3 different parameter space as some of the solvers are not consistent with some of the regularization techniques
param_grid_liblinear_saga = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l1','l2'],
'logisticregression__solver': ['liblinear', 'saga'], # 'l1' and 'l2' penalty supports 'liblinear' and 'saga'
# 'logisticregression__max_iter': [100, 200, 300],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_lbfgs_newton_cg = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l2',None],
'logisticregression__solver': ['lbfgs', 'newton-cg','newton-cholesky','sag'], # 'l2' penalty supports 'lbfgs' and 'newton-cg'
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_elasticnet = {
'logisticregression__C': [0.001, 0.01, 0.1, 1,10,100],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5,0.05,0.9,0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga'], # elasticnet supports only saga solver
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
# Create a pipeline with standardization and logistic regression
# For Logistic regression max iteration is set to 1000 to balance the accuracy and convergence time and random state as 45 to get consisten results
pipeline = Pipeline([
('scaler', StandardScaler()),
('logisticregression', LogisticRegression(max_iter=1000,random_state=45))
])
#Using Stratified K fold cross validation as the dataset has a huge imbalance in target variable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV to explore the hyperparameter combinations
# 3 Grid Search is initialized for each of the parameter space
grid_search_liblinear_saga = GridSearchCV(estimator=pipeline,param_grid=param_grid_liblinear_saga,cv=skf,n_jobs=-1,scoring='f1')
grid_search_lbfgs_newton_cg = GridSearchCV(estimator=pipeline,param_grid=param_grid_lbfgs_newton_cg,cv=skf,n_jobs=-1,scoring='f1')
grid_search_elasticnet = GridSearchCV(estimator=pipeline,param_grid=param_grid_elasticnet,cv=skf,n_jobs=-1,scoring='f1')
# Fit the model using GridSearchCV to find the best hyperparameter combination
grid_search_liblinear_saga.fit(X_train, y_train)
grid_search_lbfgs_newton_cg.fit(X_train, y_train)
grid_search_elasticnet.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000, random_state=45))])StandardScaler()
LogisticRegression(max_iter=1000, random_state=45)
#The best parameter combination obtained from grid search 1 is evaluated using f1 score
best_model = grid_search_liblinear_saga.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.07804878048780488 Test F1 Score: 0.029850746268656716
#Best parameter combination from Grid search 1
grid_search_liblinear_saga.best_params_
{'logisticregression__C': 10,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l1',
'logisticregression__solver': 'liblinear'}
#The best parameter combination obtained from grid search 2 is evaluated using f1 score
best_model = grid_search_lbfgs_newton_cg.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.07843137254901962 Test F1 Score: 0.029850746268656716
#Best parameter combination from Grid Seach 2
grid_search_lbfgs_newton_cg.best_params_
{'logisticregression__C': 100,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l2',
'logisticregression__solver': 'sag'}
#The best parameter combination obtained from grid search 3 is evaluated using f1 score
best_model = grid_search_elasticnet.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.07843137254901962 Test F1 Score: 0.029850746268656716
#Best parameter combination from Grid Search 3
grid_search_elasticnet.best_params_
{'logisticregression__C': 100,
'logisticregression__class_weight': 'balanced',
'logisticregression__l1_ratio': 0.01,
'logisticregression__penalty': 'elasticnet',
'logisticregression__solver': 'saga'}
As we have intialized 3 different parameter space for each of the solvers the final best parameter combination is chosen among the 3 grid search using f1 score as the dataset has a huge imbalance. The first and third parameter combination has less bias and less variance than 2nd hyperparameter combination
##Choose one with the best results from the above three combinations to print the classification report
#For example here grid_search_elasticnet performed better than others
best_model = grid_search_liblinear_saga.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred_test = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
##Calculating the Classification Report of the best parameter combination
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.76 0.84 663
1 0.05 0.22 0.08 37
accuracy 0.73 700
macro avg 0.50 0.49 0.46 700
weighted avg 0.90 0.73 0.80 700
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.82 0.88 284
1 0.02 0.06 0.03 16
accuracy 0.78 300
macro avg 0.48 0.44 0.45 300
weighted avg 0.89 0.78 0.83 300
Class 0 (Non-fraudulent):
Precision: 0.94,Recall: 0.82,F1-Score: 0.88 The model is still performing well on non-fraudulent transactions, but recall has decreased from the training set.
Class 1 (Fraudulent): Precision: 0.02,Recall: 0.06,F1-Score: 0.03 The model struggles even more in detecting fraudulent transactions in the test data, with very low precision and recall, leading to a poor F1-score. Accuracy: 78%, but like the training data, this is not reflective of good model performance due to the imbalance. Macro Avg (0.48) and Weighted Avg (0.78) similarly reflect the model’s imbalance issue.
The model doesn't exhibit significant variance between training and test data (performance is consistent), but it has high bias, particularly in detecting fraudulent transactions
#Printing the confusion matrix for both training and test data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Train')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Test')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.02 Recall (TPR): 0.06 F1 Score: 0.03 Accuracy: 0.78 Misclassification Rate: 0.22 True Positive Rate (TPR): 0.06 False Positive Rate (FPR): 0.18 False Negative Rate (FNR): 0.94 True Negative Rate (TNR): 0.82
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.41659330985915494
Evaluation of best parameters obtained from Grid Search
The model performs poorly on identifying fraudulent transactions (low recall and F1 score), which is crucial in fraud detection. Although the accuracy and true negative rate are relatively high, these metrics are misleading in the context of an imbalanced dataset. The false negative rate is concerningly high, meaning the model misses most of the actual fraud cases, while the false positive rate shows that the model incorrectly flags many legitimate transactions as fraudulent. The overall conclusion is that the model needs significant improvement, particularly in handling the minority class (fraudulent transactions). Focusing on recall (TPR) and lowering FNR are key in making the model more effective for fraud detection
Comparison of Model Before and After Tuning:
##Initializing K Nearest Neighbors Classifier of scikit learn
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier()
##Fit the model with the scaled data as K Nearest Neighbors is a distance based classifier
knn_clf.fit(X_train_scaled, y_train)
y_pred_train=knn_clf.predict(X_train_scaled)
y_pred_test = knn_clf.predict(X_test_scaled)
#Show the classification report of training and test data
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 663
1 0.00 0.00 0.00 37
accuracy 0.95 700
macro avg 0.47 0.50 0.49 700
weighted avg 0.90 0.95 0.92 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.94 300
macro avg 0.47 0.50 0.49 300
weighted avg 0.90 0.94 0.92 300
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.0 0.0
The model performs very well for class 0 (non-fraudulent transactions), achieving high precision, recall, and F1-scores. However, it performs very poorly for class 1 (fraudulent transactions), with precision, recall, and F1-scores all at 0. This means the model is not identifying any fraudulent transactions, both in training and test datasets.
Algorithm - Underlying method used to compute the nearest neighbors
The metric is chosen as minkowski whereas p is adjusted as 1,2 and so on to specify the norm
#Creating the parameter space for K Nearest Neighbors
param_grid= {
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1,2,3,4],
'knn__weights': ['uniform', 'distance'],
'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
# Create a pipeline with standardization and K Nearest neighbor Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('knn', KNeighborsClassifier(metric='minkowski'))
])
#Intializing Stratified K fold cross validation with folds=5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV
grid_search_knn = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_knn.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])StandardScaler()
KNeighborsClassifier()
#best parameter combination from the grid search
grid_search_knn.best_params_
{'knn__algorithm': 'auto',
'knn__n_neighbors': 3,
'knn__p': 1,
'knn__weights': 'uniform'}
#F1 score for both train and test data
best_model = grid_search_knn.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.15 Test F1 Score: 0.0
#Show the classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 663
1 1.00 0.08 0.15 37
accuracy 0.95 700
macro avg 0.98 0.54 0.56 700
weighted avg 0.95 0.95 0.93 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.99 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.94 300
macro avg 0.47 0.50 0.48 300
weighted avg 0.90 0.94 0.92 300
The model performs very well for class 0 (non-fraudulent transactions), achieving high precision, recall, and F1-scores. However, it performs very poorly for class 1 (fraudulent transactions), as evidenced by the precision, recall, and F1-scores of 0 in the test data. The high overall accuracy (0.94) is misleading due to the imbalance between non-fraudulent and fraudulent transactions.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.94 Misclassification Rate: 0.06 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.01 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 0.99
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.4810739436619718
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
Guassian Naive Bayes classifier is used as the dataset has a mix of categorical and continous variables.
#GaussianNB is initialized from sklearn library
from sklearn.naive_bayes import GaussianNB
nb_clf=GaussianNB()
#Train the model using Gaussian NB Classifier
#Scaling is not required as it is a probabilistic generative classifier
nb_clf.fit(X_train, y_train)
y_pred_train=nb_clf.predict(X_train)
y_pred_test = nb_clf.predict(X_test)
#Print the Classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.99 0.97 663
1 0.00 0.00 0.00 37
accuracy 0.94 700
macro avg 0.47 0.50 0.48 700
weighted avg 0.90 0.94 0.92 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.99 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.93 300
macro avg 0.47 0.49 0.48 300
weighted avg 0.90 0.93 0.91 300
0.0
0.0
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0 in both train data and test data as in both cases the model classifies all transactions as non fraudulent transactions
Variable Smoothing - Adding numerical stability
Another hyperparameter technique is to convert the categorical variables to continous features and that is done in the feature engineering section
#Creating the parameter space
param_grid= {
'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
# Create a pipeline with Naive Bayes Classifier
#Naive Bayes works on probability so standardization is not required
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('nb', GaussianNB())
])
#Initialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1
grid_search_nb = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_nb.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')Pipeline(steps=[('nb', GaussianNB())])GaussianNB()
#Best parameter combination
grid_search_nb.best_params_
{'nb__var_smoothing': 1e-09}
#Evaluating the bet parameter combination
best_model = grid_search_nb.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0 Test F1 Score: 0.0
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.99 0.97 663
1 0.00 0.00 0.00 37
accuracy 0.94 700
macro avg 0.47 0.50 0.48 700
weighted avg 0.90 0.94 0.92 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.99 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.93 300
macro avg 0.47 0.49 0.48 300
weighted avg 0.90 0.93 0.91 300
Training Data: Class 0 (Non-Fraudulent Transactions):
Precision: 0.95 – The model correctly identifies 95% of the transactions it predicts as non-fraudulent. Recall: 0.99 – It identifies 99% of the actual non-fraudulent transactions correctly. F1-Score: 0.97 – A strong balance between precision and recall for non-fraudulent transactions, indicating excellent performance.
Class 1 (Fraudulent Transactions):
Precision: 0.00 – This indicates that the model never predicts any fraudulent transactions correctly. Recall: 0.00 – The model fails to identify any of the actual fraudulent transactions. F1-Score: 0.00 – Reflects that the model performs poorly in detecting fraudulent transactions. Accuracy: 0.94 – The overall accuracy of the model is quite high (94%), but this is largely due to the dominance of non-fraudulent transactions (class 0). Test Data: Class 0 (Non-Fraudulent Transactions):
Precision: 0.95 – Similar to the training data, the model correctly identifies 95% of the non-fraudulent transactions it predicts. Recall: 0.99 – It correctly identifies 99% of the actual non-fraudulent transactions. F1-Score: 0.97 – This reflects strong performance for non-fraudulent transactions. Class 1 (Fraudulent Transactions):
Precision: 0.00 – The model does not correctly predict any fraudulent transactions. Recall: 0.00 – The model fails to identify any fraudulent transactions. F1-Score: 0.00 – This indicates complete failure in detecting fraudulent transactions in the test data.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.93 Misclassification Rate: 0.07 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.01 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 0.99
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.39348591549295775
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
AdaBoost Classifier is tried with Logistic Regression as base estimator
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=LogisticRegression(), random_state=42)
ada_clf.fit(X_train_scaled, y_train)
y_pred_train=ada_clf.predict(X_train_scaled)
y_pred_test = ada_clf.predict(X_test_scaled)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 663
1 0.00 0.00 0.00 37
accuracy 0.95 700
macro avg 0.47 0.50 0.49 700
weighted avg 0.90 0.95 0.92 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.95 300
macro avg 0.47 0.50 0.49 300
weighted avg 0.90 0.95 0.92 300
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0 in train data and test data as in both train and test data the model classifies all transactions as non fraudulent transactions
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__C': [1.0,0,0.01,0.1,0.001],
'ada__base_estimator__penalty': ['l2','l1','elasticnet'],
'ada__base_estimator__class_weight':[None,'balanced']
}
# Create a pipeline with standardization and AdaBoost Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),random_state=45))
])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV
grid_search_ada_v1 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v1.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))])StandardScaler()
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45)LogisticRegression(random_state=45)
LogisticRegression(random_state=45)
grid_search_ada_v1.best_params_
{'ada__base_estimator__C': 0.001,
'ada__base_estimator__class_weight': None,
'ada__base_estimator__penalty': 'l2',
'ada__learning_rate': 2.0,
'ada__n_estimators': 50}
best_model = grid_search_ada_v1.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.12413793103448277 Test F1 Score: 0.0425531914893617
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.85 0.90 663
1 0.08 0.24 0.12 37
accuracy 0.82 700
macro avg 0.52 0.55 0.51 700
weighted avg 0.91 0.82 0.86 700
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.89 0.92 284
1 0.03 0.06 0.04 16
accuracy 0.85 300
macro avg 0.49 0.48 0.48 300
weighted avg 0.90 0.85 0.87 300
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.03 Recall (TPR): 0.06 F1 Score: 0.04 Accuracy: 0.85 Misclassification Rate: 0.15 True Positive Rate (TPR): 0.06 False Positive Rate (FPR): 0.11 False Negative Rate (FNR): 0.94 True Negative Rate (TNR): 0.89
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.4689700704225352
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Using the column list before removing features with inter correlation as tree based models are not affected by the intercorrelation
X=categorical_df[column_list]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 80% for training and 20% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42,stratify=Y)
##Initializing the Decision Tree Classifier of sklearn library
model=DecisionTreeClassifier(random_state=45)
model.fit(X_train, y_train)
y_pred_train=model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 663
1 1.00 1.00 1.00 37
accuracy 1.00 700
macro avg 1.00 1.00 1.00 700
weighted avg 1.00 1.00 1.00 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.95 0.95 284
1 0.19 0.19 0.19 16
accuracy 0.91 300
macro avg 0.57 0.57 0.57 300
weighted avg 0.91 0.91 0.91 300
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.1875
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0.18 in test data as in test data the model classifies most of transactions as non fraudulent transactions
X_train.columns
Index(['Location_Germany', 'Location_France', 'MCC Category_Healthcare',
'MCC Category_Clothing', 'Location_UK', 'Previous Transactions',
'Merchant Location History', 'Credit Score', 'balance_income_ratio',
'Location_Canada', 'Balance Before Transaction', 'geo_spending_profile',
'Spending Patterns', 'credit_merchant_risk',
'merchant_reputation_encoded', 'Card Limit', 'OT_Frequency_encoded',
'MCC Category_Travel', 'Card Type_Credit', 'Customer Income',
'time_based_interaction', 'Device_Desktop', 'Day', 'Card Type_Prepaid',
'Customer Age', 'Time of Day', 'spending_pattern_score',
'risk_adjusted_limit', 'Month', 'MCC Category_Groceries', 'Velocity',
'Location_US', 'Device_POS', 'MCC Category_Electronics', 'risk_score',
'MCC Category_Restaurants', 'Device_Mobile',
'MCC Category_Entertainment', 'Card Type_Debit'],
dtype='object')
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
#Creating the parameter space
param_grid= {
'classifier__criterion': ['gini', 'entropy'], # Criterion for measuring quality of splits
'classifier__max_depth': [5, 10, 20, 30, 40, 50], # Maximum depth of the tree
'classifier__min_samples_split': [2, 5, 10,15], # Minimum number of samples required to split an internal node
'classifier__min_samples_leaf': [1, 2, 4,5], # Minimum number of samples required to be at a leaf node
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]#Class weight can be passed as dictionary mentioning the weight of each class
}
# Create a pipeline with Decision Tree Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('classifier', DecisionTreeClassifier(random_state=45))
])
#Intialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV and the metric to be tuned=f1
grid_search_dt = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_dt.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')Pipeline(steps=[('classifier', DecisionTreeClassifier(random_state=45))])DecisionTreeClassifier(random_state=45)
from sklearn.metrics import f1_score
best_model = grid_search_dt.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.5362318840579711 Test F1 Score: 0.06896551724137931
grid_search_dt.best_params_
{'classifier__class_weight': {0: 1, 1: 1000},
'classifier__criterion': 'entropy',
'classifier__max_depth': 30,
'classifier__max_features': 'log2',
'classifier__min_samples_leaf': 5,
'classifier__min_samples_split': 2}
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.90 0.95 663
1 0.37 1.00 0.54 37
accuracy 0.91 700
macro avg 0.68 0.95 0.74 700
weighted avg 0.97 0.91 0.93 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.86 0.90 284
1 0.05 0.12 0.07 16
accuracy 0.82 300
macro avg 0.50 0.49 0.48 300
weighted avg 0.90 0.82 0.86 300
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.05 Recall (TPR): 0.12 F1 Score: 0.07 Accuracy: 0.82 Misclassification Rate: 0.18 True Positive Rate (TPR): 0.12 False Positive Rate (FPR): 0.14 False Negative Rate (FNR): 0.88 True Negative Rate (TNR): 0.86
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.488556338028169
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Initializing random Forest classifier of sklearn
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=45)
rf_clf.fit(X_train, y_train)
y_pred_train=rf_clf.predict(X_train)
y_pred_test = rf_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 663
1 1.00 1.00 1.00 37
accuracy 1.00 700
macro avg 1.00 1.00 1.00 700
weighted avg 1.00 1.00 1.00 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 284
1 0.00 0.00 0.00 16
accuracy 0.95 300
macro avg 0.47 0.50 0.49 300
weighted avg 0.90 0.95 0.92 300
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.0
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
#Setting the parameter space for hyperparameter tuning
#RandomizedSearchCV is used as the number of hyperparameter combinations is very high so the model takes more time when using GridSearch
#Trying a subset of hyperparameter combinations using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid= {
'rf__n_estimators': [50,100,200],
'rf__max_depth': [5,10, 15,20],
'rf__min_samples_split': [2,5,8,10,11],
'rf__min_samples_leaf':[1,2,3,4,5],
'rf__max_features': ['auto', 'sqrt', 'log2'],
'rf__bootstrap': [True, False],
'rf__class_weight':[None,'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with Random Forest Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('rf', RandomForestClassifier(random_state=45))
])
#Initialize Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize RandomizedSearchCV with metric to be tuned as f1
grid_search_rf = RandomizedSearchCV(estimator=pipeline,param_distributions=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('rf', RandomForestClassifier(random_state=45))])RandomForestClassifier(random_state=45)
#Best parameter combination
grid_search_rf.best_params_
{'rf__n_estimators': 200,
'rf__min_samples_split': 10,
'rf__min_samples_leaf': 5,
'rf__max_features': 'sqrt',
'rf__max_depth': 5,
'rf__class_weight': {0: 1, 1: 100},
'rf__bootstrap': False}
#Evaluating the best parameter combination
best_model = grid_search_rf.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.13805970149253732 Test F1 Score: 0.10126582278481013
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.30 0.47 663
1 0.07 1.00 0.14 37
accuracy 0.34 700
macro avg 0.54 0.65 0.30 700
weighted avg 0.95 0.34 0.45 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.26 0.41 284
1 0.05 0.75 0.10 16
accuracy 0.29 300
macro avg 0.50 0.51 0.26 300
weighted avg 0.90 0.29 0.40 300
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.05 Recall (TPR): 0.75 F1 Score: 0.10 Accuracy: 0.29 Misclassification Rate: 0.71 True Positive Rate (TPR): 0.75 False Positive Rate (FPR): 0.74 False Negative Rate (FNR): 0.25 True Negative Rate (TNR): 0.26
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve-Random Forest')
plt.legend()
plt.show()
AUC-ROC: 0.5400528169014084
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#AdaBoost Classifier with decision tree as base model for the boosting rounds
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=42)
#Training the model
ada_clf.fit(X_train, y_train)
y_pred_train=ada_clf.predict(X_train)
y_pred_test = ada_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 663
1 1.00 1.00 1.00 37
accuracy 1.00 700
macro avg 1.00 1.00 1.00 700
weighted avg 1.00 1.00 1.00 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.94 0.94 284
1 0.06 0.06 0.06 16
accuracy 0.89 300
macro avg 0.50 0.50 0.50 300
weighted avg 0.90 0.89 0.90 300
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0.94 and 0.06 in test data as in test data the model classifies most of data as transactions as non fraudulent transactions
##Creating the parameter space for the ada boost algorithm
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__max_depth': [2,3,4,5],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with AdaBoost Classifier. Standardization is not required for tree based model
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),random_state=45))
])
#Initializing Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1 score
grid_search_ada_v2 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v2.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))])AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45)DecisionTreeClassifier(random_state=45)
DecisionTreeClassifier(random_state=45)
#Best parameter combinations
grid_search_ada_v2.best_params_
{'ada__base_estimator__class_weight': {0: 1, 1: 1000},
'ada__base_estimator__criterion': 'gini',
'ada__base_estimator__max_depth': 4,
'ada__learning_rate': 0.01,
'ada__n_estimators': 200}
#Evaluating the best hyperparameter combination
best_model = grid_search_ada_v2.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.9736842105263158 Test F1 Score: 0.06666666666666667
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 663
1 0.95 1.00 0.97 37
accuracy 1.00 700
macro avg 0.97 1.00 0.99 700
weighted avg 1.00 1.00 1.00 700
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.95 0.95 284
1 0.07 0.06 0.07 16
accuracy 0.91 300
macro avg 0.51 0.51 0.51 300
weighted avg 0.90 0.91 0.90 300
##Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.07 Recall (TPR): 0.06 F1 Score: 0.07 Accuracy: 0.91 Misclassification Rate: 0.09 True Positive Rate (TPR): 0.06 False Positive Rate (FPR): 0.05 False Negative Rate (FNR): 0.94 True Negative Rate (TNR): 0.95
##ROC CURVE
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
#Plotting the ROC Curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.4786531690140845
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Column list after removing the inter correlated variables are used
X=categorical_df[column_list2]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 50% for training and 50% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42,stratify=Y)
Data Standardization
## Normalizing the data using z score normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
For the algorithms to converge faster feature scaling is done by z score normalization. Data Standardization is done after train and test split to avoid data leakage during training.
Logistic Regression
#Defining Logistic regression function with default parameters from sklearn module
model=LogisticRegression(random_state=45)
#Fit the model using standardized training data
model.fit(X_train_scaled, y_train)
#Predictions are made using built model on train and test data
y_pred_train=model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)
##As it is a classification model - Accuracy, Precision,Recall and F1 score is calculated
## Classification report function of sklearn module is used to print the evaluation metrics
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 473
1 0.00 0.00 0.00 27
accuracy 0.95 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.89 0.95 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 474
1 0.00 0.00 0.00 26
accuracy 0.95 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.90 0.95 0.92 500
## As it is a class imbalance problem, the most appropriate metric to tune is f1 score which balances both precision and recall.
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.0 0.0
Classification report for train and test data shows that the model classifies all transactions as non fraudulent transactions and the model is not able to classify fraudulent transactions as the train and test f1 score is 0.
Hyperparameters Tuned for Logistic Regression -
## Creating the parameter space
##Creating 3 different parameter space as some of the solvers are not consistent with some of the regularization techniques
param_grid_liblinear_saga = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l1','l2'],
'logisticregression__solver': ['liblinear', 'saga'], # 'l1' and 'l2' penalty supports 'liblinear' and 'saga'
# 'logisticregression__max_iter': [100, 200, 300],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_lbfgs_newton_cg = {
'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,100],
'logisticregression__penalty': ['l2',None],
'logisticregression__solver': ['lbfgs', 'newton-cg','newton-cholesky','sag'], # 'l2' penalty supports 'lbfgs' and 'newton-cg'
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
param_grid_elasticnet = {
'logisticregression__C': [0.001, 0.01, 0.1, 1,10,100],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5,0.05,0.9,0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga'], # elasticnet supports only saga solver
# 'logisticregression__max_iter': [100, 200, 300,400,500],
'logisticregression__class_weight': [None, 'balanced']
}
# Create a pipeline with standardization and logistic regression
# For Logistic regression max iteration is set to 1000 to balance the accuracy and convergence time and random state as 45 to get consisten results
pipeline = Pipeline([
('scaler', StandardScaler()),
('logisticregression', LogisticRegression(max_iter=1000,random_state=45))
])
#Using Stratified K fold cross validation as the dataset has a huge imbalance in target variable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV to explore the hyperparameter combinations
# 3 Grid Search is initialized for each of the parameter space
grid_search_liblinear_saga = GridSearchCV(estimator=pipeline,param_grid=param_grid_liblinear_saga,cv=skf,n_jobs=-1,scoring='f1')
grid_search_lbfgs_newton_cg = GridSearchCV(estimator=pipeline,param_grid=param_grid_lbfgs_newton_cg,cv=skf,n_jobs=-1,scoring='f1')
grid_search_elasticnet = GridSearchCV(estimator=pipeline,param_grid=param_grid_elasticnet,cv=skf,n_jobs=-1,scoring='f1')
# Fit the model using GridSearchCV to find the best hyperparameter combination
grid_search_liblinear_saga.fit(X_train, y_train)
grid_search_lbfgs_newton_cg.fit(X_train, y_train)
grid_search_elasticnet.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000,
random_state=45))]),
n_jobs=-1,
param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
100],
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__l1_ratio': [0.01, 0.1, 0.5, 0.05,
0.9, 0.09],
'logisticregression__penalty': ['elasticnet'],
'logisticregression__solver': ['saga']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('logisticregression',
LogisticRegression(max_iter=1000, random_state=45))])StandardScaler()
LogisticRegression(max_iter=1000, random_state=45)
#The best parameter combination obtained from grid search 1 is evaluated using f1 score
best_model = grid_search_liblinear_saga.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.10246679316888045 Test F1 Score: 0.0988593155893536
#Best parameter combination from Grid search 1
grid_search_liblinear_saga.best_params_
{'logisticregression__C': 0.01,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l1',
'logisticregression__solver': 'saga'}
#The best parameter combination obtained from grid search 2 is evaluated using f1 score
best_model = grid_search_lbfgs_newton_cg.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.09828009828009829 Test F1 Score: 0.11241217798594846
#Best parameter combination from Grid Seach 2
grid_search_lbfgs_newton_cg.best_params_
{'logisticregression__C': 1,
'logisticregression__class_weight': 'balanced',
'logisticregression__penalty': 'l2',
'logisticregression__solver': 'sag'}
#The best parameter combination obtained from grid search 3 is evaluated using f1 score
best_model = grid_search_elasticnet.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0 Test F1 Score: 0.0
#Best parameter combination from Grid Search 3
grid_search_elasticnet.best_params_
{'logisticregression__C': 0.001,
'logisticregression__class_weight': 'balanced',
'logisticregression__l1_ratio': 0.05,
'logisticregression__penalty': 'elasticnet',
'logisticregression__solver': 'saga'}
As we have intialized 3 different parameter space for each of the solvers the final best parameter combination is chosen among the 3 grid search using f1 score as the dataset has a huge imbalance. The first and second parameter combination has less bias and less variance than 3rd hyperparameter combination
##Choose one with the best results from the above three combinations to print the classification report
#For example here grid_search_elasticnet performed better than others
best_model = grid_search_lbfgs_newton_cg.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred_test = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
##Calculating the Classification Report of the best parameter combination
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.94 0.24 0.38 473
1 0.05 0.74 0.10 27
accuracy 0.27 500
macro avg 0.50 0.49 0.24 500
weighted avg 0.89 0.27 0.37 500
Classification Report of Test Data
precision recall f1-score support
0 0.98 0.20 0.34 474
1 0.06 0.92 0.11 26
accuracy 0.24 500
macro avg 0.52 0.56 0.23 500
weighted avg 0.93 0.24 0.33 500
Evaluation results shows that the model has predicted 74% as fraudulent class as with respect to fraudulent class the recall value is 0.74 and the model has predicted 24% as non-fraudulent class as with respect to non-fraudulent class the recall value is 0.24 for training data.
Evaluation results shows that the model has predicted 92% as fraudulent class as with respect to fraudulent class the recall value is 0.92 and the model has predicted 20% as non-fraudulent class as with respect to non-fraudulent class the recall value is 0.20 for test data.
The model has a high bias and less variance between the training and test data as the f1 score for both train and test data is around 0.10.
#Printing the confusion matrix for both training and test data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Train')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Logistic Regression - Test')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.06 Recall (TPR): 0.92 F1 Score: 0.11 Accuracy: 0.24 Misclassification Rate: 0.76 True Positive Rate (TPR): 0.92 False Positive Rate (FPR): 0.80 False Negative Rate (FNR): 0.08 True Negative Rate (TNR): 0.20
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.46178188899707884
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
##Initializing K Nearest Neighbors Classifier of scikit learn
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier()
##Fit the model with the scaled data as K Nearest Neighbors is a distance based classifier
knn_clf.fit(X_train_scaled, y_train)
y_pred_train=knn_clf.predict(X_train_scaled)
y_pred_test = knn_clf.predict(X_test_scaled)
#Show the classification report of training and test data
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 473
1 0.00 0.00 0.00 27
accuracy 0.95 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.89 0.95 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 1.00 0.97 474
1 0.00 0.00 0.00 26
accuracy 0.95 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.90 0.95 0.92 500
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
0.0 0.0
Evaluation metrics on train and test data shows that model is able to classify non fraudulent transactions effectively in train data and test data but unable to classify the fraudulent transactions
Algorithm - Underlying method used to compute the nearest neighbors
The metric is chosen as minkowski whereas p is adjusted as 1,2 and so on to specify the norm
#Creating the parameter space for K Nearest Neighbors
param_grid= {
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1,2,3,4],
'knn__weights': ['uniform', 'distance'],
'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
# Create a pipeline with standardization and K Nearest neighbor Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('knn', KNeighborsClassifier(metric='minkowski'))
])
#Intializing Stratified K fold cross validation with folds=5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initializing GridSearchCV
grid_search_knn = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_knn.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
n_jobs=-1,
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 10, 15],
'knn__p': [1, 2, 3, 4],
'knn__weights': ['uniform', 'distance']},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])StandardScaler()
KNeighborsClassifier()
#best parameter combination from the grid search
grid_search_knn.best_params_
{'knn__algorithm': 'auto',
'knn__n_neighbors': 3,
'knn__p': 1,
'knn__weights': 'uniform'}
#F1 score for both train and test data
best_model = grid_search_knn.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.06896551724137931 Test F1 Score: 0.0
#Show the classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 473
1 0.50 0.04 0.07 27
accuracy 0.95 500
macro avg 0.72 0.52 0.52 500
weighted avg 0.92 0.95 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.99 0.97 474
1 0.00 0.00 0.00 26
accuracy 0.94 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.90 0.94 0.92 500
The above best hyperparameter combination of KNN model is able to classify non fraudulent transactions in both train and test data but is unable to classify fraudulent transactions in test data and only 4% in train data. Thus the model has high bias
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for KNN-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.94 Misclassification Rate: 0.06 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.01 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 0.99
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.49099318403115866
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
Guassian Naive Bayes classifier is used as the dataset has a mix of categorical and continous variables.
#GaussianNB is initialized from sklearn library
from sklearn.naive_bayes import GaussianNB
nb_clf=GaussianNB()
#Train the model using Gaussian NB Classifier
#Scaling is not required as it is a probabilistic generative classifier
nb_clf.fit(X_train, y_train)
y_pred_train=nb_clf.predict(X_train)
y_pred_test = nb_clf.predict(X_test)
#Print the Classification report
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.99 0.97 473
1 0.17 0.04 0.06 27
accuracy 0.94 500
macro avg 0.56 0.51 0.51 500
weighted avg 0.91 0.94 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.98 0.96 474
1 0.00 0.00 0.00 26
accuracy 0.93 500
macro avg 0.47 0.49 0.48 500
weighted avg 0.90 0.93 0.91 500
0.06060606060606061
0.0
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0.06 in train data and 0 in test data as in both cases the model classifies all transactions as non fraudulent transactions
Variable Smoothing - Adding numerical stability
Another hyperparameter technique is to convert the categorical variables to continous features and that is done in the feature engineering section
#Creating the parameter space
param_grid= {
'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
# Create a pipeline with Naive Bayes Classifier
#Naive Bayes works on probability so standardization is not required
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('nb', GaussianNB())
])
#Initialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1
grid_search_nb = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_nb.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('nb', GaussianNB())]), n_jobs=-1,
param_grid={'nb__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06,
1e-05]},
scoring='f1')Pipeline(steps=[('nb', GaussianNB())])GaussianNB()
#Best parameter combination
grid_search_nb.best_params_
{'nb__var_smoothing': 1e-09}
#Evaluating the bet parameter combination
best_model = grid_search_nb.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.06060606060606061 Test F1 Score: 0.0
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.99 0.97 473
1 0.17 0.04 0.06 27
accuracy 0.94 500
macro avg 0.56 0.51 0.51 500
weighted avg 0.91 0.94 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.98 0.96 474
1 0.00 0.00 0.00 26
accuracy 0.93 500
macro avg 0.47 0.49 0.48 500
weighted avg 0.90 0.93 0.91 500
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for NaiveBayes-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.00 Recall (TPR): 0.00 F1 Score: 0.00 Accuracy: 0.93 Misclassification Rate: 0.07 True Positive Rate (TPR): 0.00 False Positive Rate (FPR): 0.02 False Negative Rate (FNR): 1.00 True Negative Rate (TNR): 0.98
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.43614086335605323
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
AdaBoost Classifier is tried with Logistic Regression as base estimator
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=LogisticRegression(), random_state=42)
ada_clf.fit(X_train_scaled, y_train)
y_pred_train=ada_clf.predict(X_train_scaled)
y_pred_test = ada_clf.predict(X_test_scaled)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 1.00 0.97 473
1 0.00 0.00 0.00 27
accuracy 0.94 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.89 0.94 0.92 500
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.99 0.97 474
1 0.00 0.00 0.00 26
accuracy 0.94 500
macro avg 0.47 0.50 0.49 500
weighted avg 0.90 0.94 0.92 500
Training the model using default parameters shows that the model has very high bias. As the model has a f1 score of 0 in train data and test data as in both train and test data the model classifies all transactions as non fraudulent transactions
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__C': [1.0,0,0.01,0.1,0.001],
'ada__base_estimator__penalty': ['l2','l1','elasticnet'],
'ada__base_estimator__class_weight':[None,'balanced']
}
# Create a pipeline with standardization and AdaBoost Classifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),random_state=45))
])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV
grid_search_ada_v1 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v1.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__C': [1.0, 0, 0.01, 0.1, 0.001],
'ada__base_estimator__class_weight': [None,
'balanced'],
'ada__base_estimator__penalty': ['l2', 'l1',
'elasticnet'],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('scaler', StandardScaler()),
('ada',
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45))])StandardScaler()
AdaBoostClassifier(base_estimator=LogisticRegression(random_state=45),
random_state=45)LogisticRegression(random_state=45)
LogisticRegression(random_state=45)
grid_search_ada_v1.best_params_
{'ada__base_estimator__C': 0.001,
'ada__base_estimator__class_weight': 'balanced',
'ada__base_estimator__penalty': 'l2',
'ada__learning_rate': 0.1,
'ada__n_estimators': 100}
best_model = grid_search_ada_v1.best_estimator_
y_pred_train = best_model.predict(X_train_scaled)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test_scaled)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.0979020979020979 Test F1 Score: 0.03418803418803419
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 0.95 0.77 0.85 473
1 0.06 0.26 0.10 27
accuracy 0.74 500
macro avg 0.50 0.51 0.47 500
weighted avg 0.90 0.74 0.81 500
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.81 0.87 474
1 0.02 0.08 0.03 26
accuracy 0.77 500
macro avg 0.48 0.44 0.45 500
weighted avg 0.89 0.77 0.83 500
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.02 Recall (TPR): 0.08 F1 Score: 0.03 Accuracy: 0.77 Misclassification Rate: 0.23 True Positive Rate (TPR): 0.08 False Positive Rate (FPR): 0.19 False Negative Rate (FNR): 0.92 True Negative Rate (TNR): 0.81
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.46502758844530995
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Using the column list before removing features with inter correlation as tree based models are not affected by the intercorrelation
X=categorical_df[column_list]
Y=categorical_df['Is Fraudulent']
## Splitting the data into train and test - 80% for training and 20% for testing.
## As from EDA the dataset has a huge imbalance between the classes so stratify option is set to Y
## -- to use stratified random sampling in splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42,stratify=Y)
##Initializing the Decision Tree Classifier of sklearn library
model=DecisionTreeClassifier(random_state=45)
model.fit(X_train, y_train)
y_pred_train=model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.95 0.95 189
1 0.00 0.00 0.00 11
accuracy 0.90 200
macro avg 0.47 0.48 0.47 200
weighted avg 0.89 0.90 0.90 200
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.0
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
X_train.columns
Index(['Location_Germany', 'Location_France', 'MCC Category_Healthcare',
'MCC Category_Clothing', 'Location_UK', 'Previous Transactions',
'Merchant Location History', 'Credit Score', 'balance_income_ratio',
'Location_Canada', 'Balance Before Transaction', 'geo_spending_profile',
'Spending Patterns', 'credit_merchant_risk',
'merchant_reputation_encoded', 'Card Limit', 'OT_Frequency_encoded',
'MCC Category_Travel', 'Card Type_Credit', 'Customer Income',
'time_based_interaction', 'Device_Desktop', 'Day', 'Card Type_Prepaid',
'Customer Age', 'Time of Day', 'spending_pattern_score',
'risk_adjusted_limit', 'Month', 'MCC Category_Groceries', 'Velocity',
'Location_US', 'Device_POS', 'MCC Category_Electronics', 'risk_score',
'MCC Category_Restaurants', 'Device_Mobile',
'MCC Category_Entertainment', 'Card Type_Debit'],
dtype='object')
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
#Creating the parameter space
param_grid= {
'classifier__criterion': ['gini', 'entropy'], # Criterion for measuring quality of splits
'classifier__max_depth': [5, 10, 20, 30, 40, 50], # Maximum depth of the tree
'classifier__min_samples_split': [2, 5, 10,15], # Minimum number of samples required to split an internal node
'classifier__min_samples_leaf': [1, 2, 4,5], # Minimum number of samples required to be at a leaf node
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]#Class weight can be passed as dictionary mentioning the weight of each class
}
# Create a pipeline with Decision Tree Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('classifier', DecisionTreeClassifier(random_state=45))
])
#Intialize Stratified K fold cross validation with folds as 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV and the metric to be tuned=f1
grid_search_dt = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_dt.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('classifier',
DecisionTreeClassifier(random_state=45))]),
n_jobs=-1,
param_grid={'classifier__class_weight': [None, {0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'classifier__criterion': ['gini', 'entropy'],
'classifier__max_depth': [5, 10, 20, 30, 40, 50],
'classifier__max_features': [None, 'sqrt', 'log2'],
'classifier__min_samples_leaf': [1, 2, 4, 5],
'classifier__min_samples_split': [2, 5, 10, 15]},
scoring='f1')Pipeline(steps=[('classifier', DecisionTreeClassifier(random_state=45))])DecisionTreeClassifier(random_state=45)
from sklearn.metrics import f1_score
best_model = grid_search_dt.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.34854771784232363 Test F1 Score: 0.22950819672131148
grid_search_dt.best_params_
{'classifier__class_weight': {0: 1, 1: 1000},
'classifier__criterion': 'entropy',
'classifier__max_depth': 20,
'classifier__max_features': 'sqrt',
'classifier__min_samples_leaf': 1,
'classifier__min_samples_split': 2}
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.79 0.88 758
1 0.21 1.00 0.35 42
accuracy 0.80 800
macro avg 0.61 0.90 0.62 800
weighted avg 0.96 0.80 0.86 800
Classification Report of Test Data
precision recall f1-score support
0 0.97 0.77 0.86 189
1 0.14 0.64 0.23 11
accuracy 0.77 200
macro avg 0.56 0.70 0.55 200
weighted avg 0.93 0.77 0.83 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for Decision Tree-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.14 Recall (TPR): 0.64 F1 Score: 0.23 Accuracy: 0.77 Misclassification Rate: 0.23 True Positive Rate (TPR): 0.64 False Positive Rate (FPR): 0.23 False Negative Rate (FNR): 0.36 True Negative Rate (TNR): 0.77
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.6960076960076961
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#Initializing random Forest classifier of sklearn
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=45)
rf_clf.fit(X_train, y_train)
y_pred_train=rf_clf.predict(X_train)
y_pred_test = rf_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 1.00 0.97 189
1 0.00 0.00 0.00 11
accuracy 0.94 200
macro avg 0.47 0.50 0.49 200
weighted avg 0.89 0.94 0.92 200
print(f1_score(y_train,y_pred_train))
print(f1_score(y_test,y_pred_test))
1.0 0.0
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
#Setting the parameter space for hyperparameter tuning
#RandomizedSearchCV is used as the number of hyperparameter combinations is very high so the model takes more time when using GridSearch
#Trying a subset of hyperparameter combinations using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid= {
'rf__n_estimators': [50,100,200],
'rf__max_depth': [5,10, 15,20],
'rf__min_samples_split': [2,5,8,10,11],
'rf__min_samples_leaf':[1,2,3,4,5],
'rf__max_features': ['auto', 'sqrt', 'log2'],
'rf__bootstrap': [True, False],
'rf__class_weight':[None,'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with Random Forest Classifier
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('rf', RandomForestClassifier(random_state=45))
])
#Initialize Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize RandomizedSearchCV with metric to be tuned as f1
grid_search_rf = RandomizedSearchCV(estimator=pipeline,param_distributions=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('rf',
RandomForestClassifier(random_state=45))]),
n_jobs=-1,
param_distributions={'rf__bootstrap': [True, False],
'rf__class_weight': [None, 'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'rf__max_depth': [5, 10, 15, 20],
'rf__max_features': ['auto', 'sqrt',
'log2'],
'rf__min_samples_leaf': [1, 2, 3, 4, 5],
'rf__min_samples_split': [2, 5, 8, 10,
11],
'rf__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('rf', RandomForestClassifier(random_state=45))])RandomForestClassifier(random_state=45)
#Best parameter combination
grid_search_rf.best_params_
{'rf__n_estimators': 100,
'rf__min_samples_split': 10,
'rf__min_samples_leaf': 3,
'rf__max_features': 'sqrt',
'rf__max_depth': 5,
'rf__class_weight': {0: 1, 1: 1000},
'rf__bootstrap': False}
#Evaluating the best parameter combination
best_model = grid_search_rf.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 0.12103746397694524 Test F1 Score: 0.09782608695652173
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 0.20 0.33 758
1 0.06 1.00 0.12 42
accuracy 0.24 800
macro avg 0.53 0.60 0.22 800
weighted avg 0.95 0.24 0.32 800
Classification Report of Test Data
precision recall f1-score support
0 0.93 0.13 0.23 189
1 0.05 0.82 0.10 11
accuracy 0.17 200
macro avg 0.49 0.48 0.16 200
weighted avg 0.88 0.17 0.22 200
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for RandomForest-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
# Print results
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.05 Recall (TPR): 0.82 F1 Score: 0.10 Accuracy: 0.17 Misclassification Rate: 0.83 True Positive Rate (TPR): 0.82 False Positive Rate (FPR): 0.87 False Negative Rate (FNR): 0.18 True Negative Rate (TNR): 0.13
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve-Random Forest')
plt.legend()
plt.show()
AUC-ROC: 0.5300625300625301
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
#AdaBoost Classifier with decision tree as base model for the boosting rounds
from sklearn.ensemble import AdaBoostClassifier
ada_clf=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=42)
#Training the model
ada_clf.fit(X_train, y_train)
y_pred_train=ada_clf.predict(X_train)
y_pred_test = ada_clf.predict(X_test)
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred_test)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.94 0.93 0.93 189
1 0.00 0.00 0.00 11
accuracy 0.88 200
macro avg 0.47 0.46 0.47 200
weighted avg 0.89 0.88 0.88 200
Training the model using default parameters shows that the model has very high variance. As the model has a f1 score of 1 in train data but .93 and 0 in test data as in test data the model classifies all transactions as non fraudulent transactions
##Creating the parameter space for the ada boost algorithm
param_grid= {
'ada__n_estimators': [50, 100, 200],
'ada__learning_rate': [0.01, 0.1, 1.0,2.0],
'ada__base_estimator__max_depth': [2,3,4,5],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__class_weight':[None,{0:1,1:50},'balanced',{0:1,1:100},{0:1,1:1000}]
}
# Create a pipeline with AdaBoost Classifier. Standardization is not required for tree based model
pipeline = Pipeline([
# ('scaler', StandardScaler()),
('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),random_state=45))
])
#Initializing Stratified K fold cross validation with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Initialize GridSearchCV with tuning parameter as f1 score
grid_search_ada_v2 = GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=skf,n_jobs=-1,scoring='f1')
# Fit GridSearchCV
grid_search_ada_v2.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
estimator=Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))]),
n_jobs=-1,
param_grid={'ada__base_estimator__class_weight': [None,
{0: 1, 1: 50},
'balanced',
{0: 1, 1: 100},
{0: 1, 1: 1000}],
'ada__base_estimator__criterion': ['gini', 'entropy'],
'ada__base_estimator__max_depth': [2, 3, 4, 5],
'ada__learning_rate': [0.01, 0.1, 1.0, 2.0],
'ada__n_estimators': [50, 100, 200]},
scoring='f1')Pipeline(steps=[('ada',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45))])AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=45),
random_state=45)DecisionTreeClassifier(random_state=45)
DecisionTreeClassifier(random_state=45)
#Best parameter combinations
grid_search_ada_v2.best_params_
{'ada__base_estimator__class_weight': {0: 1, 1: 100},
'ada__base_estimator__criterion': 'gini',
'ada__base_estimator__max_depth': 2,
'ada__learning_rate': 1.0,
'ada__n_estimators': 50}
#Evaluating the best hyperparameter combination
best_model = grid_search_ada_v2.best_estimator_
y_pred_train = best_model.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
Train F1 Score: 1.0 Test F1 Score: 0.19999999999999998
print("Classification Report of Training Data")
report = classification_report(y_train, y_pred_train)
print(report)
print("Classification Report of Test Data")
report = classification_report(y_test, y_pred)
print(report)
Classification Report of Training Data
precision recall f1-score support
0 1.00 1.00 1.00 758
1 1.00 1.00 1.00 42
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
Classification Report of Test Data
precision recall f1-score support
0 0.95 0.96 0.96 189
1 0.22 0.18 0.20 11
accuracy 0.92 200
macro avg 0.59 0.57 0.58 200
weighted avg 0.91 0.92 0.92 200
##Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_train, y_pred_train)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Train Data')
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix for AdaBoost-Test Data')
plt.show()
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
# Confusion matrix elements
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Precision, Recall, F1, Accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# TPR, FPR, FNR, TNR
tpr = tp / (tp + fn) # True Positive Rate (Sensitivity or Recall)
fpr = fp / (fp + tn) # False Positive Rate (1 - Specificity)
fnr = fn / (fn + tp) # False Negative Rate
tnr = tn / (tn + fp) # True Negative Rate (Specificity)
print(f"Precision: {precision:.2f}")
print(f"Recall (TPR): {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Misclassification Rate: {misclassification_rate:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")
Precision: 0.22 Recall (TPR): 0.18 F1 Score: 0.20 Accuracy: 0.92 Misclassification Rate: 0.08 True Positive Rate (TPR): 0.18 False Positive Rate (FPR): 0.04 False Negative Rate (FNR): 0.82 True Negative Rate (TNR): 0.96
##ROC CURVE
y_proba = best_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f'AUC-ROC: {auc}')
#Plotting the ROC Curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='ROC-AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
AUC-ROC: 0.6137566137566137
Evaluation of best parameters obtained from Grid Search
Comparison of Model Before and After Tuning:
Overall Performance: -Comparing All algorithms
#The hyperparameter combination that acheived good results is used
tree_model=DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)
tree_model.fit(X_train,y_train)
DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. DecisionTreeClassifier(class_weight={0: 1, 1: 1000}, criterion='entropy',
max_depth=20, max_features='sqrt', random_state=45)##Visualizing the decision tree
from sklearn.tree import plot_tree
plt.figure(figsize=(15,10)) # Set the figure size
plot_tree(tree_model, filled=True, feature_names=column_list2, class_names=categorical_df['Is Fraudulent'].astype(str))
plt.title("Decision Tree")
plt.show()
# 2. Plot Feature Importances
# Get feature importances
importances = tree_model.feature_importances_
# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
'Feature': column_list,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(8,6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette="viridis")
plt.title("Feature Importance in Decision Tree Classifier")
plt.show()
Velocity holds the highest feature importance followed by Customer Age and Day